library(stm)
library(quanteda)

# CONTROL AND CONSERVATIVE
#
# `COVID_Data_June_27_Control` is not created in this script; it has to be in
# the workspace before the script is run. It is intentionally NOT attach()ed:
# the modelling calls in this file receive their covariates through explicit
# `data =` / `meta =` arguments, so attaching only adds masking risk on the
# search path (and attaching the same object repeatedly stacks copies).
# Fail fast here, as attach() used to, if the data frame is missing.
stopifnot(exists("COVID_Data_June_27_Control"))


###################### Q22 ################################################################################################################################

# Prepare data: turn the Q22 free-text answers into a trimmed, stemmed
# document-feature matrix and convert it to the list format used by stm.

# Terms dropped before modelling: the SMART stopword list plus four
# survey-specific words.
q22_drop_terms <- c(stopwords(source = "smart"),
                    "pandemic", "virus", "mask", "wear")

data2 <- corpus(COVID_Data_June_27_Control, text_field = 'Q22')

# Store each document's raw text as a docvar named `text` so it is carried
# along with the other document variables.
# NOTE(review): texts() and the stem=/remove= arguments of dfm() appear to be
# deprecated in newer quanteda releases -- confirm against the installed version.
docvars(data2)$text <- texts(data2)

# Build the dfm, then keep only features occurring at least 28 times overall.
data2 <- dfm_trim(
  dfm(data2, stem = TRUE, remove = q22_drop_terms, remove_punct = TRUE),
  min_termfreq = 28
)

# `out2` is used below via its `documents`, `vocab` and `meta` components.
out2 <- convert(data2, to = 'stm')

summary(out2)
############## How many topics? ##########################################

# Compare candidate topic counts (5, 7 and 10) with Conservative as the
# prevalence covariate. The seed is fixed immediately before the search.
set.seed(4)
kResult2 <- searchK(documents = out2$documents,
                    vocab = out2$vocab,
                    K = c(5, 7, 10),
                    init.type = "Spectral",
                    prevalence = ~ Conservative,
                    data = out2$meta)

# Built-in diagnostic plot for the search result.
plot(kResult2)

# Single-panel scatter of exclusivity against semantic coherence, with each
# point labelled by its K underneath (pos = 1).
par(mfrow = c(1, 1), mar = c(5, 5, 5, 5))
k_diagnostics <- kResult2$results
plot(k_diagnostics$semcoh, k_diagnostics$exclus,
     xlab = "Semantic Coherence",
     ylab = "Exclusivity")
text(k_diagnostics$semcoh, k_diagnostics$exclus,
     labels = paste("K", k_diagnostics$K),
     pos = 1)

# Tabulate the full diagnostics table.
knitr::kable(k_diagnostics)

# Fit the 5-topic model and estimate the effect of Conservative on topic
# prevalence.

data_6 <- stm(documents = out2$documents,
              vocab = out2$vocab,
              data = out2$meta,
              prevalence = ~ Conservative,
              K = 5, verbose = FALSE)
summary(data_6)

# Seed the RNG as its own statement, right before the effect estimation.
# Previously `set.seed(4)` was passed as an extra positional argument to
# estimateEffect(); that only took effect as a side effect of argument
# evaluation and occupied a real parameter slot of the function.
# `metadata` is also spelled out in full instead of relying on partial
# matching of `meta`.
set.seed(4)
prep_6 <- estimateEffect(1:5 ~ Conservative, data_6,
                         metadata = out2$meta)

# Top 10 words per topic, for all topics.
labelTopics(data_6, topics = NULL, n = 10)

# Summary plot of the topics, annotated with 7 FREX words each and the
# hand-assigned topic names.
par(mfrow = c(1, 1), mar = c(7, 1, 3, .5))
plot(data_6, xlim = c(0, .7), n = 7, labeltype = "frex",
     topic.names = c("(1) President's Response (General)",
                     "(2) Federal Government's Response",
                     "(3) President's Response (Negative)",
                     "(4) Governors' Response",
                     "(5) States' Response"),
     custom.labels = "", text.cex = .6)


summary(prep_6)

##################################################################
##################################################################
##################################################################
# Jason's additions

# Document-by-topic probabilities (prevalence) of the fitted model. Taken by
# name (`theta`) rather than by list position so the lookup does not depend
# on the internal ordering of the object returned by stm().
data_6_prevalence_matrix <- data_6$theta

# Histograms of topic probabilities, one panel per topic (one column each).
par(mfrow = c(3, 2))
for (k in seq_len(ncol(data_6_prevalence_matrix))) {
  hist(data_6_prevalence_matrix[, k],
       main = paste("Topic", k),
       xlab = "Probability")
}

# Histogram of the maximum topic probability per document: this is the
# probability that is used to classify the document as a single topic.
# Row-wise maximum computed in one call instead of growing a vector in a loop.
data_6_max_topic <- apply(data_6_prevalence_matrix, 1, max)

hist(data_6_max_topic,
     main = "Distribution of Max Topic Probabilities",
     xlab = "Probability")



###################### PLOTS #################################

# One point-estimate panel per topic showing the estimated topic proportion
# by Conservative. The five panels differ only in topic number, title and
# x-axis range, so those are listed once here and the shared plotting
# arguments are written a single time in the loop below.
#
# NOTE(review): the B / SE / p values in the titles are hard-coded text, not
# computed from `prep_6` -- re-check them against summary(prep_6) after any
# refit. "p < 0.00" presumably means p < .001; confirm before publication.
effect_panels <- list(
  list(topic = 1,
       title = "President Response (General) \n B = -0.02, SE = 0.02, p = 0.07",
       xlim = c(-.1, .3)),
  list(topic = 2,
       title = "Federal Government's Response \n B = -0.07, SE = 0.01, p < 0.00",
       xlim = c(-.1, .3)),
  list(topic = 3,
       title = "President's Response (Negative) \n B = -0.10, SE = 0.01, p < 0.00",
       xlim = c(-.25, .4)),
  list(topic = 4,
       title = "Governor's Response \n B = 0.06, SE = 0.01, p < 0.00",
       xlim = c(-.25, .3)),
  list(topic = 5,
       title = "State's Response \n B = 0.14, SE = 0.03, p < 0.00",
       xlim = c(-.25, .3))
)

par(mfrow = c(2, 3), mar = c(2.5, 1, 3, .5))
for (panel in effect_panels) {
  # NOTE(review): cov.value1 / cov.value2 are documented for
  # method = "difference"; they are kept here unchanged. Also verify that the
  # order of `custom.labels` matches the order in which the covariate values
  # are drawn, otherwise the two labels are swapped.
  plot(prep_6, covariate = "Conservative", topics = panel$topic, model = data_6,
       method = "pointestimate", cov.value1 = "0", cov.value2 = "1",
       xlab = "", main = panel$title,
       cex.main = .90, cex.lab = 1.5, cex.axis = 0.65,
       xlim = panel$xlim, labeltype = "custom",
       custom.labels = c("Conservative", "Not Conservative"))
}



################### Find Thoughts

# Refit a 5-topic model on Q22 and pull the most representative responses
# for topic 5.
#
# NOTE(review): this model uses a different removal list and frequency
# threshold (25) than the `data_6` model, so its topic numbering is not
# guaranteed to line up with the topic names assigned to `data_6` -- confirm
# that topic 5 here is the topic you mean.

data9r <- corpus(COVID_Data_June_27_Control, text_field = 'Q22')

# Save the raw response text as docvar `text`; it is read back from
# `out$meta` when looking up example documents below.
docvars(data9r)$text <- texts(data9r)
data9r <- dfm(data9r, stem = TRUE, remove = c(stopwords(source = "smart"),"pandemic", "virus", "respond",  "mask", "wear", "spread"),
              remove_punct = TRUE) %>% dfm_trim(min_termfreq = 25)

out <- convert(data9r, to = 'stm')


model <- stm(out$documents, out$vocab, K = 5, max.em.its = 150, data = out$meta,
             init.type = "Spectral", prevalence =~ Conservative)

summary(model)

# Use the text saved in the metadata, not `out$meta$Q22`: Q22 was consumed as
# the corpus text field, so it is not among the document variables, whereas
# `text` is and has one entry per row of `out$meta`.
thoughts1 <- findThoughts(model, texts = out$meta$text, topics = 5, n = 15)

head(thoughts1)


